library(ggplot2); # For some graphs
library(FactoMineR); # For PCA
library(factoextra); # For PCA
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(caret); # For feature plots
## Loading required package: lattice
library(e1071); # For SVM
library(nnet); # For Neural nets
library(pROC);
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(cluster) # For k-means
library(fpc) # For k-means
library(seriation) # For k-means
## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy
## 
## Attaching package: 'seriation'
## The following object is masked from 'package:lattice':
## 
##     panel.lines
library(ggpubr) # For clustering visualization
library(TSclust) # For clustering Evaluation
## Loading required package: pdc
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(knitr); # For table printing
# Build the 36 input attribute names ("p<pixel>.<band>" for the 9 pixels of
# the 3x3 window and the 4 spectral bands, in pixel-major order) plus "class".
# Vectorized: paste0 recycles the 4-band vector across the 36 slots, which
# reproduces the former nested-loop order without growing a vector inside a loop.
attribute.names = c(
  paste0("p", rep(1:9, each = 4), ".", c("Red", "Green", "NIR1", "NIR2")),
  "class"
)

# Create a vector containing all class names, in the order of the numeric
# class codes so that factor() below maps code -> label correctly
class.names = c("red soil", "cotton crop", "grey soil", "damp grey soil", "soil with vegetation stubble", "very damp grey soil")

# Load the space-separated dataset. read.csv already returns a data.frame,
# so the former data.frame() wrapper was redundant and has been dropped.
dataset = read.csv("dataset.csv", col.names = attribute.names, sep = " ")
# Map the numeric class codes to readable labels.
# NOTE(review): factor(labels = ...) assumes the file contains exactly 6
# distinct class codes whose sorted order matches class.names — confirm
# against the raw data.
dataset$class = factor(dataset$class, labels = class.names)

# Determine the input space (all columns but the last) and the output space.
# BUGFIX (latent): the original index `1:ncol(dataset)-1` parses as
# (1:ncol(dataset)) - 1, i.e. 0:(ncol-1); it only selected the right columns
# because R silently drops index 0. Written explicitly and safely:
dataset.input = dataset[, seq_len(ncol(dataset) - 1)]
dataset.output = dataset$class
# Per-attribute descriptive statistics. Each result is a named vector,
# except range/quantile which return one column per attribute.
dataset.mean = sapply(dataset.input, mean)
dataset.sd = sapply(dataset.input, sd)
dataset.var = sapply(dataset.input, var)
dataset.min = sapply(dataset.input, min)
dataset.max = sapply(dataset.input, max)
dataset.median = sapply(dataset.input, median)
dataset.range = sapply(dataset.input, range)
dataset.quantile = sapply(dataset.input, quantile)
# Determine class frequencies and attach a percentage label per class.
class.frequencies = data.frame(table(dataset.output))
colnames(class.frequencies) = c("class", "frequencies")
# paste0 replaces paste(..., sep = ""); format(digits = 2) is kept as-is so
# the printed percentage strings are unchanged.
class.frequencies$percentages = paste0(format((class.frequencies$frequencies / sum(class.frequencies$frequencies)) * 100, digits = 2), "%")
# Bar chart of absolute frequencies, annotated with the percentages.
ggplot(data = class.frequencies, aes(x = class, y = frequencies, fill = class)) + geom_bar(stat = "identity") + geom_text(aes(label = percentages), vjust = 1.5, colour = "white")

# Same class distribution rendered as a pie chart.
ggplot(data = class.frequencies, aes(x = "", y = frequencies, fill = class)) + geom_bar(stat = "identity", width = 1, color = "white") + coord_polar("y", start = 0)

## Separabilità I valori relativi allo stesso canale tra pixel adiacenti risultano correlati (come evidente dai grafici), questo perché il dataset è costruito come una sliding window quadrata 3x3 su una singola immagine satellitare. Ci si aspetta quindi una varianza piuttosto contenuta sui singoli valori e la sopra citata correlazione.

TODO: Aggiungere titoli ai featureplot

# Pairwise feature plots of selected attributes, coloured by class.
# BUGFIX: the original scales argument was `list((relation="free"), y=...)`,
# which assigned a stray global variable `relation` and passed an unnamed
# "free" element instead of freeing the x scales; fixed to the explicit
# x=/y= form already used correctly later in this file. Titles added
# (resolves the TODO above); neighbour labels assume row-major 3x3 pixel
# numbering (p4/p6 horizontal, p2/p8 vertical around center p5).
featurePlot(dataset.input[, c("p5.Red", "p5.Green", "p5.NIR1")], dataset.output, plot = "pairs", scales = list(x = list(relation = "free"), y = list(relation = "free")), auto.key = list(columns = 3), main = "Center pixel: Red vs Green vs NIR1")

featurePlot(dataset.input[, c("p5.Red", "p6.Red", "p4.Red")], dataset.output, plot = "pairs", scales = list(x = list(relation = "free"), y = list(relation = "free")), auto.key = list(columns = 3), main = "Red band: center vs horizontal neighbours")

featurePlot(dataset.input[, c("p5.Green", "p6.Green", "p4.Green")], dataset.output, plot = "pairs", scales = list(x = list(relation = "free"), y = list(relation = "free")), auto.key = list(columns = 3), main = "Green band: center vs horizontal neighbours")

featurePlot(dataset.input[, c("p5.Green", "p2.Green", "p8.Green")], dataset.output, plot = "pairs", scales = list(x = list(relation = "free"), y = list(relation = "free")), auto.key = list(columns = 3), main = "Green band: center vs vertical neighbours")

featurePlot(dataset.input[, c("p5.Red", "p2.Red", "p8.Red")], dataset.output, plot = "pairs", scales = list(x = list(relation = "free"), y = list(relation = "free")), auto.key = list(columns = 3), main = "Red band: center vs vertical neighbours")

featurePlot(dataset.input[, c("p5.NIR1", "p2.NIR1", "p8.NIR1")], dataset.output, plot = "pairs", scales = list(x = list(relation = "free"), y = list(relation = "free")), auto.key = list(columns = 3), main = "NIR1 band: center vs vertical neighbours")

PCA

Data la correlazione rilevata precedentemente e l’alto numero di attributi in input sembra più che opportuno eseguire una PCA sul dataset al fine di ridurre la dimensione dello spazio di input.

# Execute the PCA on the standardized inputs (scale.unit = TRUE) and keep
# only the first four principal components (ncp = 4); graph = FALSE
# suppresses the default plots.
pca.results <- PCA(dataset.input, scale.unit = TRUE, ncp = 4, graph = FALSE)

# Get the eigenvalues and plot them (scree plot). The first four components
# explain about 92% of variance.
pca.results.eig.val <- get_eigenvalue(pca.results)
fviz_eig(pca.results, addlabels = TRUE, ylim = c(0, 50))

# Show the correlation between the variables on the correlation circle of
# the first two principal components. Many variables are positively
# correlated (arrows pointing in the same direction).
fviz_pca_var(pca.results, col.var = "black")

Analizzando le coordinate per i singoli attributi si riesce a capire che i 4 cluster rappresentano i 4 tipi di variabili in gioco (Red, Green, NIR1, NIR2)

# Extract the individuals' coordinates on the retained principal components
# and rebuild a labelled dataset (class column + 4 PC columns).
dataset.input.pca = data.frame(get_pca_ind(pca.results)$coord)
dataset.pca = cbind(class=dataset$class, dataset.input.pca)

# Draw some feature plots to highlight the level of difficulty to recognise
# the various classes: per-component class-conditional densities first...
featurePlot(x=dataset.input.pca, y=dataset.output, plot="density", scales=list(x=list(relation="free"), y=list(relation="free")), auto.key=list(columns=3))

# ...then pairwise scatter plots of the components, coloured by class.
featurePlot(x=dataset.input.pca, y=dataset.output, plot="pairs", auto.key=list(columns=3))

# Divide the dataset into trainset (~70%) and testset (~30%) by drawing a
# 1/2 group label per row.
# NOTE(review): no set.seed() precedes these sample() calls, so the split —
# and every result below — is not reproducible across runs.
indexes = sample(2, nrow(dataset.pca), replace = TRUE, prob = c(0.7, 0.3))
temp.trainset = dataset.pca[indexes == 1,]
testset = dataset.pca[indexes == 2,]
testset.x = testset[, !names(testset) %in% c("class")]
testset.y = testset[,c("class")]
# Divide the trainset again (~90/10) to obtain a validationset for tuning.
indexes = sample(2, nrow(temp.trainset), replace = TRUE, prob = c(0.9, 0.1))
trainset = temp.trainset[indexes == 1,]
validationset = temp.trainset[indexes == 2,]
validationset.x = validationset[,!names(validationset) %in% c("class")]
validationset.y =  validationset[,c("class")]

SVM Training

# Tune an RBF SVM over a gamma/cost grid; sampling = "fix" makes tune use
# the supplied validation.x/validation.y once instead of cross-validating.
svm.tuning.control = tune.control(sampling = "fix")
svm.tuning = tune.svm(class ~ ., data = trainset, kernel = "radial", gamma = 10^(-3:1), cost = 10^(-2:1), validation.x = validationset[,!names(validationset) %in% c("class")], validation.y = validationset[,c("class")], tunecontrol = svm.tuning.control, probability = TRUE)
plot(svm.tuning)

# Evaluate the best model on the validation set.
svm.tuned = svm.tuning$best.model
svm.tuned.validation.prediction = predict(svm.tuned, validationset.x)
svm.tuned.validation.confusion_matrix = table(svm.tuned.validation.prediction, validationset.y)
svm.tuned.accuracy = sum(diag(svm.tuned.validation.confusion_matrix)) / sum(svm.tuned.validation.confusion_matrix)
sprintf("Tuned SVM accuracy is %2.1f%%", (1 - svm.tuning$best.performance) * 100)
## [1] "Tuned SVM accuracy is 89.8%"
# BUGFIX: %g instead of %d — gamma/cost are doubles (e.g. 0.1), and
# sprintf("%d", 0.1) raises an error in R.
sprintf("Best SVM parameters were gamma: %g cost: %g", svm.tuning$best.parameters$gamma, svm.tuning$best.parameters$cost)
## [1] "Best SVM parameters were gamma: 1 cost: 1"
print(svm.tuned.validation.confusion_matrix)
##                                validationset.y
## svm.tuned.validation.prediction red soil cotton crop grey soil damp grey soil
##    red soil                           99           0         0              0
##    cotton crop                         0          51         0              1
##    grey soil                           0           0        95              8
##    damp grey soil                      0           0         2             25
##    soil with vegetation stubble        0           2         0              1
##    very damp grey soil                 0           1         2             11
##                                validationset.y
## svm.tuned.validation.prediction soil with vegetation stubble
##    red soil                                                3
##    cotton crop                                             3
##    grey soil                                               0
##    damp grey soil                                          0
##    soil with vegetation stubble                           45
##    very damp grey soil                                     4
##                                validationset.y
## svm.tuned.validation.prediction very damp grey soil
##    red soil                                       0
##    cotton crop                                    0
##    grey soil                                      2
##    damp grey soil                                 4
##    soil with vegetation stubble                   3
##    very damp grey soil                           98
# Tune a single-hidden-layer neural net on hidden size and weight decay.
nnet.tuning.control = tune.control(sampling = 'fix', performances = TRUE)
# BUGFIX 1: the decay grid was `2^-2:1`, which parses as (2^-2):1 and yields
# only 0.25; `2^(-2:1)` gives the intended c(0.25, 0.5, 1, 2).
# BUGFIX 2: nnet.tuning.control was created but never passed, so tuning
# silently used cross-validation instead of the fixed validation set;
# tunecontrol is now supplied, consistently with the SVM tuning above.
nnet.tuning = tune.nnet(class ~., data = trainset, size = c(2:4), validation.x = validationset.x, validation.y = validationset.y, decay = c(0, 2^(-2:1)), tunecontrol = nnet.tuning.control)
nnet.tuned = nnet.tuning$best.model
plot(nnet.tuning)

acc = 1 - nnet.tuning$best.performance
sprintf("Tuned neuralnet overall accuracy is %2.1f%%", acc * 100)
## [1] "Tuned neuralnet overall accuracy is 85.9%"
# BUGFIX: %g for decay — decay values are doubles (e.g. 0.25), and
# sprintf("%d", 0.25) raises an error in R.
sprintf("Tuned neuralnet best parameters were size: %d decay: %g", nnet.tuning$best.parameters$size, nnet.tuning$best.parameters$decay)
## [1] "Tuned neuralnet best parameters were size: 4 decay: 0"
# NOTE(review): despite the "validation" names below, these predictions are
# computed on the TEST set (testset.x / testset.y).
predictions = predict(nnet.tuning$best.model, testset.x)
predictions.classes = class.names[max.col(predictions)]
errors = predictions.classes != testset.y
# BUGFIX: this ratio is the error rate, not an accuracy; it previously
# overwrote `acc` with a misleading value.
nnet.test.error.rate = length(which(errors))/length(errors)
nnet.tuned.validation.predictions = predictions.classes
nnet.tuned.validation.confusion_matrix = table(nnet.tuned.validation.predictions, testset.y)
print(nnet.tuned.validation.confusion_matrix)
##                                  testset.y
## nnet.tuned.validation.predictions red soil cotton crop grey soil damp grey soil
##      cotton crop                         0         188         0              0
##      damp grey soil                      2           1        24             51
##      grey soil                           8           0       394             40
##      red soil                          430           0         0              3
##      soil with vegetation stubble        7          14         1             13
##      very damp grey soil                 0           1         4             67
##                                  testset.y
## nnet.tuned.validation.predictions soil with vegetation stubble
##      cotton crop                                             8
##      damp grey soil                                          4
##      grey soil                                               1
##      red soil                                                5
##      soil with vegetation stubble                          158
##      very damp grey soil                                    17
##                                  testset.y
## nnet.tuned.validation.predictions very damp grey soil
##      cotton crop                                    0
##      damp grey soil                                54
##      grey soil                                     15
##      red soil                                       0
##      soil with vegetation stubble                  22
##      very damp grey soil                          397

Confronto modelli

# Compare the two tuned models via multiclass ROC AUC on the validation set.
# The SVM must be asked for class probabilities explicitly...
svm.tuned.validation.predictions = predict(svm.tuned, validationset.x, probability = TRUE)
svm.tuned.validation.predictions.probs = attr(svm.tuned.validation.predictions, "probabilities")
svm.roc = multiclass.roc(validationset.y, svm.tuned.validation.predictions.probs)
# ...while predict.nnet returns class probabilities by default (type = "raw");
# the former `probability = T` is not a predict.nnet argument and appears to
# have been swallowed by `...`, so it is dropped here.
nnet.tuned.validation.predictions = predict(nnet.tuned, validationset.x)
nnet.roc = multiclass.roc(validationset.y, nnet.tuned.validation.predictions)
sprintf("SVM ROC AUC: %f", svm.roc$auc)
## [1] "SVM ROC AUC: 0.980260"
sprintf("NNET ROC AUC: %f", nnet.roc$auc)
## [1] "NNET ROC AUC: 0.972077"

Calcolo metriche

source("compute_f_scores.R")

# Per-class weights = relative class frequencies, named by class and in
# class.names order. Vectorized replacement for the former grow-a-list loop
# plus Reduce/mapply; the result is the same named numeric vector.
counts = vapply(class.names, function(cls) class.frequencies[class.frequencies$class == cls, "frequencies"], numeric(1))
total_instances_count = sum(counts)
weights = counts / total_instances_count
# F-score summary for the tuned SVM on its validation confusion matrix.
fscores = compute_f_scores(class.names, svm.tuned.validation.confusion_matrix, weights)

# Macro F-score: unweighted mean of the per-class F-scores.
svm.macro.fscore = Reduce('+', fscores$fscores) / length(class.names)
# Micro F-score: sum of the frequency-weighted per-class F-scores.
svm.micro.fscore = Reduce('+', fscores$fscores.weighted)

sprintf("SVM macro F-Score: %1.3f", svm.macro.fscore)
## [1] "SVM macro F-Score: 0.872"
sprintf("SVM micro F-Score: %1.3f", svm.micro.fscore)
## [1] "SVM micro F-Score: 0.896"
# Per-class table: precision/recall as percentages, F-score in [0, 1].
per_class_metrics = data.frame(
  Precision = 100 * unname(unlist(fscores$precisions)),
  Recall = 100 * unname(unlist(fscores$recalls)),
  Fscore = unname(unlist(fscores$fscores)),
  row.names = names(fscores$precisions)
)
print(per_class_metrics)
##                              Precision    Recall    Fscore
## red soil                      97.05882 100.00000 0.9850746
## cotton crop                   92.72727  94.44444 0.9357798
## grey soil                     90.47619  95.95960 0.9313725
## damp grey soil                80.64516  54.34783 0.6493506
## soil with vegetation stubble  88.23529  81.81818 0.8490566
## very damp grey soil           84.48276  91.58879 0.8789238
# Same F-score summary for the tuned neural net (test-set confusion matrix).
fscores = compute_f_scores(class.names, nnet.tuned.validation.confusion_matrix, weights)
# Macro = unweighted mean over classes; micro = frequency-weighted sum.
nnet.macro.fscore = Reduce('+', fscores$fscores) / length(class.names)
nnet.micro.fscore = Reduce('+', fscores$fscores.weighted)
sprintf("NNet macro F-Score: %1.3f", nnet.macro.fscore)
## [1] "NNet macro F-Score: 0.787"
# BUGFIX: label typo — "Net" -> "NNet", consistent with the macro line above.
sprintf("NNet micro F-Score: %1.3f", nnet.micro.fscore)
## [1] "NNet micro F-Score: 0.831"
# Per-class table: precision/recall as percentages, F-score in [0, 1].
per_class_metrics = data.frame("Precision" = unname(unlist(fscores$precisions)), "Recall" = unname(unlist(fscores$recalls)), "Fscore" = unname(unlist(fscores$fscores)))
rownames(per_class_metrics) = names(fscores$precisions)
per_class_metrics$Precision = per_class_metrics$Precision * 100
per_class_metrics$Recall = per_class_metrics$Recall * 100
print(per_class_metrics)
##                              Precision   Recall    Fscore
## red soil                      98.17352 96.19687 0.9717514
## cotton crop                   95.91837 92.15686 0.9400000
## grey soil                     86.02620 93.14421 0.8944381
## damp grey soil                37.50000 29.31034 0.3290323
## soil with vegetation stubble  73.48837 81.86528 0.7745098
## very damp grey soil           81.68724 81.35246 0.8151951